In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
In [2]:
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook+vscode+pdf"
In [3]:
def create_folder(folder_name):
    if not os.path.exists(folder_name):
        print(f'Creating folder {folder_name}')
        os.mkdir(folder_name)

# to always have the newest plot versions, delete file before creating new one
def remove_file_if_exists(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)

create_folder('data/')
create_folder('data/svd')
In [4]:
# keep only needed data
def truncate_df(df, columns_to_keep, years=None, months=None, hours=None):
    # modify timestamp column
    df['date_time'] = pd.to_datetime(df['date_time'],format='%Y-%m-%d %H:%M:%S')

    # build prefix str
    prefix_str = 'columns-' + '_'.join(columns_to_keep)
    columns_to_keep = ['date_time'] + columns_to_keep + ['city']
    df = df[columns_to_keep]

    if years:
        df = df[df['date_time'].dt.year.isin(years)]
        prefix_str += '-years-' + '_'.join(map(str, years))

    if months:
        df = df[df['date_time'].dt.month.isin(months)]
        prefix_str += '-months-' + '_'.join(map(str, months))

    if hours:
        df = df[df['date_time'].dt.hour.isin(hours)]
        prefix_str += '-hours-' + '_'.join(map(str, hours))

    prefix_str = prefix_str.replace(' ', '')
    return df, prefix_str

Prepare Data for SVD Analysis¶

In [5]:
# function to create csv file used for analysis
def create_svd_df(df, column_value, outstring):
    OUTPUT_FILENAME = f'data/svd/data_svd_{outstring}.csv'
    
    unique_towns = sorted(list(df['city'].unique()))
    
    new_index = pd.to_datetime(df['date_time'],format='%Y-%m-%d %H:%M:%S').unique()
    data = np.array(df[column_value])
    data = data.reshape(len(new_index), len(unique_towns))

    ret_df = pd.DataFrame(
        data=data, 
        columns=unique_towns, 
        index=new_index, 
        dtype=float
    )
    ret_df.to_csv(OUTPUT_FILENAME)
In [6]:
# svd global variables and creating dirs
SVD_FOLDER_PATH = 'svd_plots'
create_folder(SVD_FOLDER_PATH)

FULL_DATA_FILENAME = 'data/data.csv.gz'

COLUMNS_TO_KEEP = ['humidity']
df_data = pd.read_csv(FULL_DATA_FILENAME, compression='gzip')

df_data, PREFIX_STR = truncate_df(
    df=df_data.copy(),
    columns_to_keep=COLUMNS_TO_KEEP,
    # months=[6,7,8],
)

SVD_FOLDER_PATH = f'{SVD_FOLDER_PATH}/{PREFIX_STR}'
create_folder(SVD_FOLDER_PATH)

# create reconstuction dir
# SVD_RECONSTUCTION_FOLDER_PATH = f'{SVD_FOLDER_PATH}/reconstuction'
# create_folder(SVD_RECONSTUCTION_FOLDER_PATH)
In [7]:
GEOLOCATION_FILENAME = 'data/geo_position.csv'
df_geolocation = pd.read_csv(GEOLOCATION_FILENAME).sort_values(by='CITY')
unique_towns = sorted(list(df_geolocation['CITY'].unique()))  # get unique names of towns ordered by name

SVD_DATA_FILENAME = f'data/svd/data_svd_{PREFIX_STR}.csv'

if not os.path.exists(SVD_DATA_FILENAME):
    print(f'Creating svd file for {PREFIX_STR}')
    create_svd_df(df=df_data, column_value=COLUMNS_TO_KEEP, outstring=PREFIX_STR)
Creating svd file for columns-humidity

SVD¶

In [8]:
print(f'Loading svd file for {PREFIX_STR}')
df_svd = pd.read_csv(SVD_DATA_FILENAME, index_col=0)
df_svd.index = pd.to_datetime(df_svd.index)

# df_svd = trunc_df(df_svd)
svd_A = np.array(df_svd)

# build matrix U, S, V
svd_U, svd_S, svd_V = np.linalg.svd(svd_A, full_matrices=False)
Loading svd file for columns-humidity

Precision of SVD Resconstrucion¶

In [9]:
# function to calculate and plot precision of svd reconstruction
def svd_precision(svd_S):
    SVD_PRECISION_FILENAME = f'{SVD_FOLDER_PATH}/{PREFIX_STR}_svd_precision.png'
    remove_file_if_exists(SVD_PRECISION_FILENAME)
    
    fig = go.Figure(
        data=[go.Bar(
                x=np.arange(np.size(svd_S)),
                y=np.cumsum(svd_S / np.sum(svd_S))
            )
        ]
    )
    
    fig.update_layout(
        title_text='SVD Precision', 
        title_x=0.5,
        xaxis_title='Rank', 
        yaxis_title='Precision',
        width=1485,
        height=450,
    )
    fig.update_yaxes(range=[0.5, 1])
    fig.write_image(SVD_PRECISION_FILENAME)
    fig.show()
In [10]:
# Calculating svd reconstruction precision
svd_precision(svd_S)

Full Reconstruction & Lower Rank Reconstruction¶

In [11]:
# full reconstruction - matrix svd_Ar
svd_Ar = np.dot(svd_U * svd_S, svd_V)
print(f'Diff: {np.mean(np.abs(svd_A - svd_Ar))}')

# lower rank reconstruction - matrix svd_Ar
k = 5
svd_Ar = np.dot(svd_U[:,:k] * svd_S[:k], svd_V[:k, :])

print(f'Diff reconstruction: {np.mean(np.abs(svd_A - svd_Ar))}')
Diff: 4.785856615762843e-14
Diff reconstruction: 0.9695606734152513

Average SVD Error¶

In [12]:
# function to calculate and plot average error of svd for k=n
def svd_average_error(svd_A, svd_Ar, k, unique_towns):
    SVD_AVG_ERR_FILENAME = f'{SVD_FOLDER_PATH}/{PREFIX_STR}_svd_avg_err.png'
    remove_file_if_exists(SVD_AVG_ERR_FILENAME)

    svd_err = np.average(np.abs(svd_A - svd_Ar), axis=0)
    asix_range = np.arange(0, len(unique_towns))

    fig = go.Figure(
        data=[
                go.Bar(
                    x=asix_range,
                    y=svd_err
                )
        ]
    )

    fig.update_layout(
        title_text='SVD Average Error', title_x=0.5,
        yaxis_title=f'Average error of reconstruction with rank k={k}',
        xaxis=dict(tickmode='array',
            tickvals=asix_range,
            ticktext=unique_towns
        ),
        width=1485,
        height=450,
    )
    fig.update_xaxes(tickangle=90)
    fig.write_image(SVD_AVG_ERR_FILENAME)
    fig.show()
In [13]:
# Calculating average error
svd_average_error(
    svd_A=svd_A, 
    svd_Ar=svd_Ar, 
    k=k, 
    unique_towns=unique_towns
)

Dates to Concept - SVD_U¶

In [14]:
# function to plot dates to concept for k=n
def svd_dates_to_concept(k, index, svd_U, trim=False):
    SVD_DTC_FILENAME = f'{SVD_FOLDER_PATH}/{PREFIX_STR}_svd_dates_to_concept.png'
    remove_file_if_exists(SVD_DTC_FILENAME)

    fig = go.Figure()
    # if we have limited date range
    if trim == True:
        x_tmp = np.arange(0, len(index))
        for i in range(k):
            fig.add_trace(go.Scatter(
                x=x_tmp,
                y=svd_U[:, i], name=f'k={i}'
            ))
        fig.update_layout(
            xaxis=dict(
                showticklabels=False,
            ),
            title=dict(
                text='Dates to Concept - Ploted as Continous Function', 
                x=0.5,
            )
        )
    # if we have full date range
    else:
        for i in range(k):
            fig.add_trace(go.Scatter(
                x=index,
                y=svd_U[:, i], name=f'k={i}'
            ))
            fig.update_layout(
                title=dict(
                    text='Dates to Concept', 
                    x=0.5,
                )
        )
         
    fig.write_image(SVD_DTC_FILENAME)
    
    fig.update_layout(
        xaxis=dict(
            rangeslider=dict(visible=True),
        ),
        width=1485,
        height=450,
    )
    fig.show()
In [15]:
# dates to concept
svd_dates_to_concept(
    k=k, 
    index=df_svd.index, 
    svd_U=svd_U,
)
In [16]:
if 'months-' in PREFIX_STR or 'years-' in PREFIX_STR:
    svd_dates_to_concept(
    k=k, 
    index=df_svd.index, 
    svd_U=svd_U,
    trim=True
)

Towns to Concept - SVD_V¶

In [17]:
# function to plot towns to concept for k=n
def svd_towns_to_concept(k, svd_V, unique_towns, type='bar'):
    SVD_TTC_FILENAME = f'{SVD_FOLDER_PATH}/{PREFIX_STR}_svd_towns_to_concept.png'
    remove_file_if_exists(SVD_TTC_FILENAME)
    
    asix_range = np.arange(0, len(unique_towns))
    all_plots = []
    for i in range(k):
        if type == 'bar':
            all_plots.append(go.Bar(x=asix_range, y=svd_V[i, :], name=f'{i}'))
        elif type == 'lines':
            all_plots.append(go.Scatter(mode='lines', x=asix_range, y=svd_V[i, :], name=f'{i}'))
        
    fig = go.Figure(data=all_plots)
    
    fig.update_layout(
        title_text=f'Towns to Concept - {type.capitalize()}', 
        title_x=0.5,
        xaxis=dict(
                tickmode='array',
                tickvals=asix_range,
                ticktext=unique_towns
        ),
        width=1485,
        height=450,
    )
    
    fig.update_xaxes(tickangle=90)
    fig.write_image(SVD_TTC_FILENAME)
    fig.show()
In [18]:
# towns to concept
svd_towns_to_concept(
    k=k,
    svd_V=svd_V, 
    unique_towns=unique_towns
)
In [19]:
# towns to concept
svd_towns_to_concept(
    k=k,
    svd_V=svd_V, 
    unique_towns=unique_towns,
    type='lines'
)

Singular Vector - SVD_S¶

In [20]:
'; '.join(map(str, svd_S))
Out[20]:
'75803.89607117951; 4053.8215361371404; 2387.6352966807217; 1652.0914741073727; 1239.8196238390885; 1127.3209703086852; 825.9239336358328; 664.902666553453; 571.2074230472492; 497.99171093302664; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 7.511668529219464e-12; 5.8144165497903685e-12'

SVD Maps¶

In [21]:
# plot map with values from SVD_V (towns to concept)
def plot_svd_map(vector, k, data_geo):
    SVD_MAP_FILENAME = f'{SVD_FOLDER_PATH}/{PREFIX_STR}_svd_map_k{k}.png'
    remove_file_if_exists(SVD_MAP_FILENAME)
    
    data_geo['VALUES'] = vector
    px.set_mapbox_access_token(open(".mapbox_token").read())
    
    fig = px.scatter_mapbox(
        data_geo,
        size = [2] * len(data_geo.index), 
        lat="LAT", 
        lon="LNG", 
        color="VALUES",
        hover_name="CITY",
        color_continuous_scale=px.colors.cyclical.Phase,
    )
    
    fig.update_layout(
        width=1485,
        height = 700,
        margin = {
            'l':5,
            'r':5,
            't':5,
            'b':5,
        },
        autosize=True,
        mapbox = {
            'style': "open-street-map",
            'zoom': 7.5
        }
    )
    fig.write_image(SVD_MAP_FILENAME)
    fig.show()
In [22]:
# plot maps
for i in range(k):
    print(f'Ploting map for k = {i}')
    plot_svd_map(
        vector=svd_V[i, :], 
        k=i, 
        data_geo=df_geolocation.copy()
    )
Ploting map for k = 0
Ploting map for k = 1
Ploting map for k = 2
Ploting map for k = 3
Ploting map for k = 4

Export to HTML¶

In [23]:
# save notebook before nbconvert
import IPython
import time

# wait few sec for plot above to finish
time.sleep(3)
In [24]:
%%javascript
IPython.notebook.save_notebook()
In [25]:
# export notebook results to HTML
jupyter_out_filename = f'{PREFIX_STR}_svd'
!jupyter nbconvert --output-dir 'output' --output {jupyter_out_filename} --to=HTML svd.ipynb
!jupyter nbconvert --output-dir 'output' --output {jupyter_out_filename} --to=pdf svd.ipynb

jupyter_out_filename_no_code = f'{PREFIX_STR}_svd_no_code'
!jupyter nbconvert --output-dir 'output' --output {jupyter_out_filename_no_code} --no-input --to=HTML svd.ipynb
!jupyter nbconvert --output-dir 'output' --output {jupyter_out_filename_no_code} --no-input --to=pdf svd.ipynb